Jupyter Notebook of the Iris Flower Dataset
Notebook is by Solomon Sonya 0xSolomonSonya
Some code and data cells in this notebook may have been augmented from ChatGPT, Copilot, Gemini, other Generative AI models, and online resources.
We will model our data in Phases:¶
- Phase 1: Prepare and Clean the Dataset.
- Phase 2: Explore the Dataset
- Phase 3: Modeling & Classification
- Phase 4: Model Evaluation
- Phase 5: Model Selection & Deployment
Data Wrangling:¶
- Augmentation (add new row or column)
- Subsetting (Filter based on condition)
- Cleaning (drop_na, fill_na, imputation, outliers)
- Aggregating (Groupby.mean())
- Transforming (Scale, Standardize, Normalize; a sketch of each operation follows this list)
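A minimal sketch of these five operations in pandas, using a small hypothetical DataFrame df (the notebook applies the same ideas to the iris data below):

import numpy as np
import pandas as pd

# toy frame used only to illustrate each wrangling operation
df = pd.DataFrame({'val': [1.0, 2.0, np.nan, 4.0],
                   'grp': ['a', 'a', 'b', 'b']})

df['doubled'] = df['val'] * 2                         # augmentation: add a new column
subset = df[df['val'] > 1.5]                          # subsetting: filter on a condition
df['val'] = df['val'].fillna(df['val'].mean())        # cleaning: mean imputation
means = df.groupby('grp')['val'].mean()               # aggregating: group-wise mean
z = (df['val'] - df['val'].mean()) / df['val'].std()  # transforming: z-score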
Phase 1: Prepare and Clean the Data
imports¶
from sklearn.datasets import load_iris
from sklearn import datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statistics as stat
import scipy.stats as st
import scipy as sp
import os
#import scikitplot as skplt
import datetime
from tabulate import tabulate
import sklearn
import plotly.graph_objs as go
import ipywidgets as widgets
import math
import statsmodels
import warnings
import io
import inspect
import sys
import traceback
from scipy.stats.mstats import winsorize
from IPython.display import display, HTML
from sklearn.preprocessing import RobustScaler
#scale the data via z-score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
# preprocessing
#learning and prediction algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier, Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
from statsmodels.graphics.gofplots import ProbPlot
from statsmodels.formula.api import ols
import statsmodels.api as sm
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.linear_model import Lasso, LassoCV, RidgeCV, ElasticNetCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVR
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder # Optional for categorical labels
from sklearn.metrics import classification_report
from sklearn.inspection import permutation_importance
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.linear_model import RidgeClassifier
# models
from sklearn import ensemble
from sklearn import linear_model
from sklearn import naive_bayes
from sklearn import gaussian_process
from sklearn import neighbors
from sklearn import svm
from xgboost import XGBRegressor
from sklearn import discriminant_analysis
from sklearn import neural_network
from sklearn import calibration
#from lightgbm import LGBMClassifier
#from catboost import CatBoostClassifier
# save and import trained models
import pickle
# Deep Learning
#import tensorflow as tf
#from tensorflow import keras
#from sklearn.datasets import make_classification
# dimensionality reduction (PCA, t-SNE)
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
#importing [Bagging]
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
#importing [Boosting]
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor
import xgboost
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
# model tuning
from sklearn.model_selection import GridSearchCV
# evaluation metrics
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, r2_score
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve
from sklearn.metrics import make_scorer,mean_squared_error, r2_score, mean_absolute_error
from sklearn.metrics import median_absolute_error, mean_absolute_percentage_error, rand_score
from sklearn.metrics import jaccard_score, dcg_score, consensus_score, d2_absolute_error_score
from sklearn.metrics import d2_pinball_score, d2_tweedie_score, davies_bouldin_score
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import auc, precision_recall_curve
from sklearn.metrics import cohen_kappa_score
from kneed import KneeLocator
from sklearn.metrics import silhouette_score
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.model_selection import KFold
#update system
#!pip install --upgrade scikit-learn
#!pip install xgboost
#model tuning
%matplotlib inline
#to ignore warnings
warnings.filterwarnings("ignore")
print("imports complete.")
imports complete.
#pip install kneed
image source: https://peaceadegbite1.medium.com/iris-flower-classification-60790e9718a1, Iris Flower Classification, Peace Ikeoluwa Adegbite, Retrieved: 2025-02-04
load dataset¶
iris = load_iris()
df_iris = pd.DataFrame(data=iris.data, columns=iris.feature_names)
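Note: newer scikit-learn releases (0.23+) can return the same data as a ready-made DataFrame; an equivalent sketch, shown as an aside:

# alternative: ask scikit-learn to build the DataFrame directly (requires scikit-learn >= 0.23)
# df_iris_alt = load_iris(as_frame=True).frame   # feature columns plus a 'target' column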
view dataframe¶
df_iris
| sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | |
|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 |
| ... | ... | ... | ... | ... |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 |
150 rows × 4 columns
count number of values that are negative¶
negative_mask = df_iris < 0
total_negative_values = negative_mask.sum().sum()
print("Total number of negative values:", total_negative_values)
Total number of negative values: 0
view columns¶
print(list(df_iris.columns))
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
rename columns, eliminate whitespaces¶
# old name: new name
col_rename = {'sepal length (cm)': 'sepal_length',
'sepal width (cm)' : 'sepal_width',
'petal length (cm)': 'petal_length',
'petal width (cm)' : 'petal_width' }
# df_iris = df_iris.rename(columns=col_rename)
# or
df_iris.rename(columns=col_rename, inplace=True)
view first 6 instances in dataframe¶
df_iris.head(6)
| sepal_length | sepal_width | petal_length | petal_width | |
|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 |
| 5 | 5.4 | 3.9 | 1.7 | 0.4 |
augment the dataframe with the class label¶
df_iris['class_label'] = iris.target
sample 4 random instances (without replacement)¶
df_iris.sample(4)
| sepal_length | sepal_width | petal_length | petal_width | class_label | |
|---|---|---|---|---|---|
| 81 | 5.5 | 2.4 | 3.7 | 1.0 | 1 |
| 8 | 4.4 | 2.9 | 1.4 | 0.2 | 0 |
| 128 | 6.4 | 2.8 | 5.6 | 2.1 | 2 |
| 97 | 6.2 | 2.9 | 4.3 | 1.3 | 1 |
create mapping & augment with flower category name¶
map_col_name = { 0: 'Iris-setosa',
1: 'Iris-versicolor',
2: 'Iris-virginica'}
df_iris['category'] = df_iris['class_label'].apply(lambda x: map_col_name[x])
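Note: Series.map accepts the mapping dict directly, so the same augmentation can be written without a lambda:

# equivalent, without a lambda:
# df_iris['category'] = df_iris['class_label'].map(map_col_name)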
sample 10 instances (with replacement)¶
df_iris.sample(10, replace=True)
| sepal_length | sepal_width | petal_length | petal_width | class_label | category | |
|---|---|---|---|---|---|---|
| 114 | 5.8 | 2.8 | 5.1 | 2.4 | 2 | Iris-virginica |
| 79 | 5.7 | 2.6 | 3.5 | 1.0 | 1 | Iris-versicolor |
| 125 | 7.2 | 3.2 | 6.0 | 1.8 | 2 | Iris-virginica |
| 103 | 6.3 | 2.9 | 5.6 | 1.8 | 2 | Iris-virginica |
| 84 | 5.4 | 3.0 | 4.5 | 1.5 | 1 | Iris-versicolor |
| 103 | 6.3 | 2.9 | 5.6 | 1.8 | 2 | Iris-virginica |
| 133 | 6.3 | 2.8 | 5.1 | 1.5 | 2 | Iris-virginica |
| 96 | 5.7 | 2.9 | 4.2 | 1.3 | 1 | Iris-versicolor |
| 65 | 6.7 | 3.1 | 4.4 | 1.4 | 1 | Iris-versicolor |
| 70 | 5.9 | 3.2 | 4.8 | 1.8 | 1 | Iris-versicolor |
save the full dataset¶
# make directory
os.makedirs('./data', exist_ok=True)
# save data
df_iris.to_csv('./data/iris.csv', sep=',', index=False)
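An optional sanity check: read the file back and confirm it matches the in-memory frame.

# optional round-trip check (assumes the write above succeeded)
df_check = pd.read_csv('./data/iris.csv')
assert df_check.shape == df_iris.shape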
Phase 2: Explore the dataset
review the dataset¶
df_iris.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   class_label   150 non-null    int64
 5   category      150 non-null    object
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB
# ensure index is within range of expected number of instances
df_iris.index
RangeIndex(start=0, stop=150, step=1)
# view shape in format (rows, cols) ==> rows == # of instances, cols == # of features
df_iris.shape
(150, 6)
# aggregation function to summarize numeric values within the dataset
# we use .T for transpose (i.e., swap rows with cols)
df_iris.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| sepal_length | 150.0 | 5.843333 | 0.828066 | 4.3 | 5.1 | 5.80 | 6.4 | 7.9 |
| sepal_width | 150.0 | 3.057333 | 0.435866 | 2.0 | 2.8 | 3.00 | 3.3 | 4.4 |
| petal_length | 150.0 | 3.758000 | 1.765298 | 1.0 | 1.6 | 4.35 | 5.1 | 6.9 |
| petal_width | 150.0 | 1.199333 | 0.762238 | 0.1 | 0.3 | 1.30 | 1.8 | 2.5 |
| class_label | 150.0 | 1.000000 | 0.819232 | 0.0 | 0.0 | 1.00 | 2.0 | 2.0 |
'pretty-print' of dataframe¶
# a useful helper for printing a dataframe's contents when the default display is hard to read
def print_df(dataframe):
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
print(tabulate(dataframe, headers='keys', tablefmt='pretty'))
print_df(df_iris.describe().T)
+--------------+-------+--------------------+--------------------+-----+-----+------+-----+-----+
|              | count |        mean        |        std         | min | 25% | 50%  | 75% | max |
+--------------+-------+--------------------+--------------------+-----+-----+------+-----+-----+
| sepal_length | 150.0 | 5.843333333333334  | 0.8280661279778629 | 4.3 | 5.1 | 5.8  | 6.4 | 7.9 |
| sepal_width  | 150.0 | 3.0573333333333337 | 0.435866284936698  | 2.0 | 2.8 | 3.0  | 3.3 | 4.4 |
| petal_length | 150.0 | 3.7580000000000005 | 1.7652982332594667 | 1.0 | 1.6 | 4.35 | 5.1 | 6.9 |
| petal_width  | 150.0 | 1.1993333333333336 | 0.7622376689603465 | 0.1 | 0.3 | 1.3  | 1.8 | 2.5 |
| class_label  | 150.0 |        1.0         | 0.8192319205190405 | 0.0 | 0.0 | 1.0  | 2.0 | 2.0 |
+--------------+-------+--------------------+--------------------+-----+-----+------+-----+-----+
view unique categories¶
print(df_iris['category'].unique())
['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']
view how many instances exist for each species category¶
df_iris['category'].value_counts()
category
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64
count number of empty/null instances¶
df_iris.isnull().sum()
sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
class_label     0
category        0
dtype: int64
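The iris data has no missing values, so nothing needs to be dropped or imputed here. If it did, the cleaning options listed under Data Wrangling would look roughly like this sketch (a no-op on this clean dataset):

# cleaning sketch: only runs if missing values are present
if df_iris.isnull().values.any():
    df_iris = df_iris.dropna()  # option 1: drop incomplete rows
    # option 2 (instead): impute, e.g. with column medians
    # df_iris = df_iris.fillna(df_iris.median(numeric_only=True))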
filter df for specific species (using dot notation)¶
df_iris[df_iris['category'] == 'Iris-versicolor'].tail(5)
| sepal_length | sepal_width | petal_length | petal_width | class_label | category | |
|---|---|---|---|---|---|---|
| 95 | 5.7 | 3.0 | 4.2 | 1.2 | 1 | Iris-versicolor |
| 96 | 5.7 | 2.9 | 4.2 | 1.3 | 1 | Iris-versicolor |
| 97 | 6.2 | 2.9 | 4.3 | 1.3 | 1 | Iris-versicolor |
| 98 | 5.1 | 2.5 | 3.0 | 1.1 | 1 | Iris-versicolor |
| 99 | 5.7 | 2.8 | 4.1 | 1.3 | 1 | Iris-versicolor |
# df_pairplot = df_iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'category']] # we'll do this later :-)
alternatively filter df for specific species (using dot notation)¶
df_iris[df_iris.category == 'Iris-versicolor'].tail(5)
| sepal_length | sepal_width | petal_length | petal_width | class_label | category | |
|---|---|---|---|---|---|---|
| 95 | 5.7 | 3.0 | 4.2 | 1.2 | 1 | Iris-versicolor |
| 96 | 5.7 | 2.9 | 4.2 | 1.3 | 1 | Iris-versicolor |
| 97 | 6.2 | 2.9 | 4.3 | 1.3 | 1 | Iris-versicolor |
| 98 | 5.1 | 2.5 | 3.0 | 1.1 | 1 | Iris-versicolor |
| 99 | 5.7 | 2.8 | 4.1 | 1.3 | 1 | Iris-versicolor |
df_iris
| sepal_length | sepal_width | petal_length | petal_width | class_label | category | |
|---|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | 0 | Iris-setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | 0 | Iris-setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | 0 | Iris-setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | 0 | Iris-setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | 0 | Iris-setosa |
| ... | ... | ... | ... | ... | ... | ... |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | 2 | Iris-virginica |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | 2 | Iris-virginica |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | 2 | Iris-virginica |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | 2 | Iris-virginica |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | 2 | Iris-virginica |
150 rows × 6 columns
VISUALIZE THE DATA
univariate analysis
boxplots¶
# single attribute - boxplot vertical is the default
df_iris['sepal_length'].plot(kind="box");
# display boxplot horizontally
df_iris['sepal_length'].plot(kind="box", vert=False);
adjust image size¶
fig, ax = plt.subplots(figsize=(10, 4)) # Width = 10 inches, Height = 4 inches
# Create the horizontal box plot on the specified axes
df_iris['sepal_length'].plot(kind="box", vert=False, ax=ax)
# Optional: Add a title
ax.set_title('Sepal Length Distribution')
# Optional: Customize x-axis label (since it's horizontal)
ax.set_xlabel('Sepal Length (cm)')
plt.show()
# combine single boxplots
df_iris.drop('class_label', axis=1).plot(kind="box");
#df_iris.boxplot()
df_iris.boxplot(column='sepal_length', by='category');
df_iris.drop('class_label', axis=1).boxplot(by='category', figsize=(10,15), sharex=False, sharey=False);
plt.subplots_adjust(hspace=0.25)
Observation: petal length and petal width may be better discriminators, as we can start to see where decision boundaries could be drawn
#sns.boxplot(data=df_iris, x='category', y='sepal_length')
kde¶
df_iris.drop('class_label', axis=1).plot(kind='kde');
# more plots to try later
# df_iris.plot(kind=?)
#line, bar, barh, hist, box, kde, density, area, pie, scatter, hexbin, etc
# specify list of features (aka attributes)
lst_attributes = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
sns.kdeplot(data=df_iris[lst_attributes], fill=True);
histogram¶
df_iris['sepal_length'].hist(edgecolor='black');
# adjust figure
df_iris['sepal_length'].hist(alpha=0.7, rwidth=0.93, edgecolor='black');
# kde plots an estimate of the probability density function of a continuous random variable - we can use this to visualize the density
sns.kdeplot(df_iris['sepal_length'], fill=True);
plot hist and smoothed kde¶
df_iris['sepal_length'].hist(alpha=0.7, rwidth=0.9, edgecolor='black', density=True);
sns.kdeplot(df_iris['sepal_length'], color='red', fill=False);
add median and mean¶
- if mean > median (mean is to the RIGHT of the median) --> positive skew == right-skewed
- if mean < median (mean is to the LEFT of the median) --> negative skew == left-skewed
- if mean ≈ median (mean approximately equals the median) --> data is nearly symmetric, as in a normal distribution
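As a numeric companion to the visual check, DataFrame.skew() reports sample skewness directly; a quick sketch using the attribute list defined earlier:

# numeric skewness check: > 0 suggests right skew, < 0 left skew, ~0 near symmetry
print(df_iris[lst_attributes].skew())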
mean_sepal_len = df_iris['sepal_length'].mean()
median_sepal_len = df_iris['sepal_length'].median()
# Create a histogram and KDE plot
fig, ax = plt.subplots()
ax.hist(df_iris['sepal_length'], bins=10, alpha=0.7, rwidth=0.9, edgecolor='black', density=True)
sns.kdeplot(df_iris['sepal_length'], color='red', fill=False);
# Add mean and median line
ax.axvline(mean_sepal_len, color='orange', linestyle='--', label=f'Mean: {mean_sepal_len:.2f}')
ax.axvline(median_sepal_len, color='green', linestyle='-', label=f'Median: {median_sepal_len:.2f}')
ax.legend()
plt.show();
iterate through all attributes and plot histogram on subplots¶
# create 2X2 grid for subplots
#fig, axes = plt.subplots(2, 2, figsize=(10, 8), sharex=True, sharey=True)
fig, axes = plt.subplots(2, 2, figsize=(10, 8)) # uncomment above to share same x and y scale
# flatten subplot for iteration
axes = axes.flatten()
# specify list of features (aka attributes)
#lst_attributes = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
# plot histogram in each subplot
for i, attr in enumerate(lst_attributes):
axes[i].hist(df_iris[attr], bins=10, edgecolor='black', alpha=0.7)
axes[i].set_title(f'Histogram of {attr}')
axes[i].set_xlabel(attr)
axes[i].set_ylabel('Frequency')
axes[i].tick_params(axis='both', which='both', labelsize=10)
plt.tight_layout()
plt.show()
add kde, mean, and median¶
# create 2X2 grid for subplots
# uncomment out below if you wish to share x and y data points
#fig, axes = plt.subplots(2, 2, figsize=(15, 8), sharex=True, sharey=True)
fig, axes = plt.subplots(2, 2, figsize=(10, 8), sharex=False, sharey=False)
# flatten subplot for iteration
axes = axes.flatten()
# specify list of features (aka attributes)
lst_attributes = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
# plot histogram in each subplot
for i, attr in enumerate(lst_attributes):
axes[i].hist(df_iris[attr], bins=10, edgecolor='black', alpha=0.7, density=True)
sns.kdeplot(df_iris[attr], ax=axes[i], color='red', linewidth=2, alpha=0.8)
axes[i].set_title(f'Histogram of {attr}')
axes[i].set_xlabel(attr)
axes[i].set_ylabel('Density')
axes[i].tick_params(axis='both', which='both', labelsize=10)
# add mean and median line
mean_attr = df_iris[attr].mean()
median_attr = df_iris[attr].median()
axes[i].axvline(mean_attr, color='orange', linestyle='--', label=f'Mean: {mean_attr:.2f}')
axes[i].axvline(median_attr, color='green', linestyle='-', label=f'Median: {median_attr:.2f}')
axes[i].legend()
plt.tight_layout()
plt.show()
# create 2X2 grid for subplots
fig, axes = plt.subplots(2, 2, figsize=(10, 8), sharex=False, sharey=False)
# flatten subplot for iteration
axes = axes.flatten()
# specify list of features (aka attributes)
lst_attributes = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
# plot histogram in each subplot
for i, attr in enumerate(lst_attributes):
sns.histplot(df_iris[attr], ax=axes[i], kde=True) #kde=True adds the line
#or
#sns.histplot(df_iris[attr], ax=axes[i], color="skyblue") # Histogram color
#sns.kdeplot(df_iris[attr], ax=axes[i], color="red") # KDE line color
axes[i].set_title(f'Histogram of {attr}')
axes[i].set_xlabel(attr)
axes[i].set_ylabel('Frequency')
axes[i].tick_params(axis='both', which='both', labelsize=10)
# add mean and median line
mean_attr = df_iris[attr].mean()
median_attr = df_iris[attr].median()
axes[i].axvline(mean_attr, color='orange', linestyle='--', label=f'Mean: {mean_attr:.2f}')
axes[i].axvline(median_attr, color='green', linestyle='-', label=f'Median: {median_attr:.2f}')
axes[i].legend()
plt.tight_layout()
plt.show()
bivariate analysis
df_iris.plot(kind="scatter", x='sepal_length', y='sepal_width');
# show univariate histogram and bivariate scatterplots in the same plot
sns.jointplot(data=df_iris, x='sepal_length', y='sepal_width')
<seaborn.axisgrid.JointGrid at 0x7f675ddd3690>
add hue to better distinguish classes¶
sns.scatterplot(data=df_iris, x='sepal_length', y='sepal_width', hue='category');
sns.jointplot(data=df_iris, x='sepal_length', y='sepal_width', hue='category');
# linear model plot --> scatter plot with a regression line
# shaded region represents the 95% confidence interval (derived from the standard error of the regression estimate)
sns.lmplot(data=df_iris, x='sepal_length', y='sepal_width', hue='category');
let's use an aggregation function to look at the correlation of features¶
correlation_matrix = df_iris.corr(numeric_only=True)
correlation_matrix
| sepal_length | sepal_width | petal_length | petal_width | class_label | |
|---|---|---|---|---|---|
| sepal_length | 1.000000 | -0.117570 | 0.871754 | 0.817941 | 0.782561 |
| sepal_width | -0.117570 | 1.000000 | -0.428440 | -0.366126 | -0.426658 |
| petal_length | 0.871754 | -0.428440 | 1.000000 | 0.962865 | 0.949035 |
| petal_width | 0.817941 | -0.366126 | 0.962865 | 1.000000 | 0.956547 |
| class_label | 0.782561 | -0.426658 | 0.949035 | 0.956547 | 1.000000 |
visualize correlation via heatmap¶
plt.figure(figsize=(8, 6)) # Adjust width and height as needed
sns.heatmap(correlation_matrix, annot=True, cmap='PuBu');
# bivariate analysis on pair of features with a strong correlation
sns.scatterplot(data=df_iris, x='petal_length', y='petal_width', hue='category', edgecolor='gray');
show how the sepal_length values are distributed across the different species categories¶
# unlike scatter plots, a swarmplot does not let points with identical values overlap, which helps visualize the distribution
# this chart helps us see where datapoints are most concentrated
sns.boxplot(data=df_iris, x='category', y='sepal_length');
sns.swarmplot(data=df_iris, x='category', y='sepal_length');
View centrality of the data¶
# Create a 2x2 subplot layout
fig, axes = plt.subplots(2, 2, figsize=(14, 14))
axes = axes.flatten()
# Plot boxplot and swarmplot for each feature
for i, feature in enumerate(lst_attributes):
sns.boxplot(ax=axes[i], data=df_iris, x='category', y=feature)
sns.swarmplot(ax=axes[i], data=df_iris, x='category', y=feature)
axes[i].set_title(f'{feature.capitalize()} by Category')
axes[i].set_xlabel('Category')
axes[i].set_ylabel(feature)
plt.subplots_adjust(wspace=0.3, hspace=0.4)
#plt.tight_layout()
plt.show()
#fig, axes = plt.subplots(2, 2, figsize=(14, 14))
plt.figure(figsize=(14,14))
plt.subplot(2,2,1)
sns.violinplot(x='category',y='sepal_length',data=df_iris)
sns.swarmplot(x='category', y='sepal_length', data=df_iris)
plt.subplot(2,2,2)
sns.violinplot(x='category',y='sepal_width',data=df_iris)
sns.swarmplot(x='category', y='sepal_width', data=df_iris)
plt.subplot(2,2,3)
sns.violinplot(x='category',y='petal_length',data=df_iris)
sns.swarmplot(x='category', y='petal_length', data=df_iris)
plt.subplot(2,2,4)
sns.violinplot(x='category',y='petal_width',data=df_iris);
sns.swarmplot(x='category', y='petal_width', data=df_iris);
Filter dataframe by class and plot a correlation heatmap and boxplots for each class¶
# Get the unique classes
unique_classes = df_iris['category'].unique()
lst_drop_features = ['category', 'class_label']
# Set up the figure with subplots
plt.figure(figsize=(16, 20)) # Adjusted size to fit all subplots
for i in range(len(unique_classes)):
# Get label for the current class
label = unique_classes[i]
# Filter the data for the current class
df_label = df_iris[df_iris['category'] == label].drop(lst_drop_features, axis=1)
# Calculate the correlation matrix
correlation_matrix = df_label.corr()
# Plot the heatmap
plt.subplot(len(unique_classes), 2, 2 * i + 1)
sns.heatmap(correlation_matrix, annot=True, cmap='PuBu', cbar=True)
plt.title(f'Correlation Matrix for Category {label}')
# Plot the boxplot with swarmplot overlay
plt.subplot(len(unique_classes), 2, 2 * i + 2)
sns.boxplot(data=df_label, ax=plt.gca(), palette="Blues")
for col in df_label.columns:
sns.swarmplot(x=[col]*len(df_label), y=df_label[col], ax=plt.gca(), edgecolor='black', color='red')
plt.title(f'Boxplots with Swarm for Category {label}')
# Adjust layout to add space between rows
plt.subplots_adjust(hspace=0.2)
#plt.tight_layout()
plt.show()
bivariate analysis via pairplot¶
# !!! Beware !!! too many features may take significant time to process.
# filter specific features
df_pairplot = df_iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'category']]
%%time
sns.pairplot(df_pairplot, diag_kind='kde', hue='category');
CPU times: user 2.47 s, sys: 0 ns, total: 2.47 s
Wall time: 3.3 s
Bivariate analysis with histogram across the diagonal¶
%%time
sns.pairplot(df_pairplot, diag_kind='hist', hue='category');
CPU times: user 2.68 s, sys: 11.4 ms, total: 2.69 s
Wall time: 3.55 s
View the Mean and Standard Deviation of each feature per class¶
# Calculate the mean and standard deviation for each feature grouped by category
mean_values = df_pairplot.groupby('category').mean()
std_values = df_pairplot.groupby('category').std()
# Set up the plot
plt.figure(figsize=(12, 8))
# Plotting the mean and standard deviation for each feature across the x-axis
for i, category in enumerate(mean_values.index):
plt.errorbar(mean_values.columns, mean_values.iloc[i], yerr=std_values.iloc[i],
label=f'{category}', marker='o', capsize=5)
# Customizing the plot
plt.xlabel('Features')
plt.ylabel('Value')
plt.title('Mean and Standard Deviation for Each Category')
plt.legend(title='Species')
plt.grid(True)
plt.show()
Plot same data, fill in standard deviation¶
# Calculate the mean and standard deviation for each feature grouped by category
mean_values = df_pairplot.groupby('category').mean()
std_values = df_pairplot.groupby('category').std()
# Set up the plot
plt.figure(figsize=(12, 8))
# Plotting the mean and filling the standard deviation area for each category
for i, category in enumerate(mean_values.index):
plt.plot(mean_values.columns, mean_values.iloc[i], marker='o', label=f'{category}')
plt.fill_between(mean_values.columns,
mean_values.iloc[i] - std_values.iloc[i],
mean_values.iloc[i] + std_values.iloc[i],
alpha=0.2) # Alpha controls the transparency of the fill
# Customizing the plot
plt.xlabel('Features')
plt.ylabel('Value')
plt.title('Mean and Standard Deviation for Each Category')
plt.legend(title='Species')
plt.grid(True)
plt.show()
Combine both plots, with toggles for the error-bar style and error range¶
plot_error_bar = True
plot_std_error = False
# Calculate the mean and standard deviation for each feature grouped by category
mean_values = df_pairplot.groupby('category').mean()
std_values = df_pairplot.groupby('category').std()
std_err_values = std_values / np.sqrt(df_pairplot.groupby('category').count())
# determine which bar to plot
lst_error_bar = std_values
if plot_std_error:
lst_error_bar = std_err_values
# Set up the plot
plt.figure(figsize=(12, 8))
# Plotting the mean and filling the standard deviation area for each category
for i, category in enumerate(mean_values.index):
if plot_error_bar:
plt.errorbar(mean_values.columns, mean_values.iloc[i], yerr=lst_error_bar.iloc[i],
label=f'{category}', marker='o', capsize=5)
else:
plt.plot(mean_values.columns, mean_values.iloc[i], marker='o', label=f'{category}')
plt.fill_between(mean_values.columns,
mean_values.iloc[i] - lst_error_bar.iloc[i],
mean_values.iloc[i] + lst_error_bar.iloc[i],
alpha=0.2) # Alpha controls the transparency of the fill
# Customizing the plot
plt.xlabel('Features')
plt.ylabel('Value')
plt.title('Mean and Standard Deviation for Each Category')
plt.legend(title='Species')
plt.grid(True)
plt.show()
In practice, much more time is spent in EDA; many more chart ideas are available in the seaborn and matplotlib example galleries.¶
Helper Functions
Display Confusion Matrix¶
def display_confusion_matrix(model_NAME, conf_mtx, lst_class_labels):
os.makedirs('./confusion_matrix', exist_ok=True)
plt.figure(figsize=(8, 6))
    sns.heatmap(conf_mtx, annot=True, fmt='d', cmap='Blues', cbar=False, xticklabels=lst_class_labels, yticklabels=lst_class_labels)
plt.title(model_NAME + ' Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
# Save the figure
plt.savefig('confusion_matrix/confusion_matrix_' + str(model_NAME) + '.png', dpi=300, bbox_inches='tight')
# Show the plot
plt.show()
Model Learning Curve¶
def model_learning_curve(model_name, X, y, num_trials, test_size_override, alpha, show_learning_curve):
    # set learning curves to compute at varying amounts of training data
#lst_training_size = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80]
lst_training_proportion = [0.05, 0.10, 0.20, 0.40, 0.60, 0.80]
# notify
print(f'commencing evaluation of model [{model_name}]')
# set test size
testSize = test_size_override
if testSize < .05 or testSize > .99:
testSize = 0.1
# set num trials
numTrials = num_trials
if numTrials < 1:
numTrials = 1
    # create lists to store eval scores for the training and test sets
lst_eval_scores_TRAINING_SET = []
lst_std_dev_TRAINING_SET = []
lst_std_err_TRAINING_SET = []
lst_eval_scores_TEST_SET = []
lst_std_dev_TEST_SET = []
lst_std_err_TEST_SET = []
continue_modeling = True
    # outer loop: training size
for i in range(len(lst_training_proportion)):
# set training size percentage
trainSize = lst_training_proportion[i]
# init eval scores at this percentage (index)
total_eval_score_TRAINING = 0
total_eval_score_TEST = 0
lst_training_scores_for_this_proportion = []
lst_test_scores_for_this_proportion = []
if not continue_modeling:
break
        # iterate num trials, store the average eval score
for j in range(numTrials):
# establish training/testing hold-out
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=testSize,train_size=trainSize,shuffle=True)
# initialize model
################################################################################
# General Linear Models
################################################################################
if model_name == 'logistic_regression':
mdl = LogisticRegression() # use default parameters
elif model_name == 'perceptron':
mdl = linear_model.Perceptron()
elif model_name == 'sgd_classifier':
mdl = SGDClassifier()
elif model_name == 'passive_aggressive':
mdl = linear_model.PassiveAggressiveClassifier()
elif model_name == 'ridge_classifier':
mdl = RidgeClassifier()
################################################################################
# SVM
################################################################################
elif model_name == 'linear_svc':
mdl = svm.LinearSVC()
elif model_name == 'svc':
mdl = svm.SVC(probability=True, kernel='linear')
#elif model_name == 'svc_nu':
# mdl = svm.NuSVC(probability=True, kernel='linear')
# tree
elif model_name == 'decision_tree':
mdl = DecisionTreeClassifier()
################################################################################
# ensemble
################################################################################
elif model_name == 'random_forest':
mdl = RandomForestClassifier() # random_state=42
elif model_name == 'extra_tree':
mdl = ExtraTreesClassifier()
elif model_name == 'bagging_classifier':
mdl = BaggingClassifier()
elif model_name == 'gradient_boosting':
mdl = ensemble.GradientBoostingClassifier() # random_state=42
elif model_name == 'ada_boosting':
mdl = ensemble.AdaBoostClassifier()
elif model_name == 'hist_boosting':
mdl = HistGradientBoostingClassifier()
################################################################################
# Gaussian Processes
################################################################################
elif model_name == 'gaussian_process':
mdl = gaussian_process.GaussianProcessClassifier()
################################################################################
# NBC probabilistic
################################################################################
elif model_name == 'nbc_gaussian':
mdl = GaussianNB()
elif model_name == 'nbc_bernoulli':
mdl = naive_bayes.BernoulliNB()
################################################################################
# neighbors
################################################################################
elif model_name == 'knn':
mdl = KNeighborsClassifier()
################################################################################
# discriminant analysis
################################################################################
#elif model_name == 'quadratic_discriminant':
# mdl = QuadraticDiscriminantAnalysis()
elif model_name == 'linear_discriminant':
mdl = LinearDiscriminantAnalysis()
################################################################################
# neural network
################################################################################
elif model_name == 'mlp':
mdl = MLPClassifier(hidden_layer_sizes=(100), max_iter=500)
################################################################################
# ELSE
################################################################################
else:
print(f'\n\n ERROR! I am not configured to test model {model_name}\n\n')
continue_modeling = False
break
# validate modeling
if not continue_modeling:
continue
################################################################################
# train the model to learn the training set
################################################################################
mdl.fit(X_train, y_train)
# use score helper function to apply trained model to the test set and return accuracy. then accumulate model eval performance
eval_score_TEST = mdl.score(X_test,y_test)
# store specific score
lst_test_scores_for_this_proportion.append(eval_score_TEST)
# accumulate total for the mean
total_eval_score_TEST = total_eval_score_TEST + eval_score_TEST
eval_score_TRAINING = mdl.score(X_train,y_train)
# store specific score for this num trial
lst_training_scores_for_this_proportion.append(eval_score_TRAINING)
# accumulate total for the mean
total_eval_score_TRAINING = total_eval_score_TRAINING + eval_score_TRAINING
#############################################################################################
# done with num trials (inner loop) at this training size, store average evaluation score
############################################################################################
if continue_modeling:
lst_eval_scores_TEST_SET.append(total_eval_score_TEST/numTrials)
lst_eval_scores_TRAINING_SET.append(total_eval_score_TRAINING/numTrials)
# calculate std dev
std_dev_training = np.std(lst_training_scores_for_this_proportion, ddof=1)
std_dev_test = np.std(lst_test_scores_for_this_proportion, ddof=1)
# store std dev
lst_std_dev_TRAINING_SET.append(std_dev_training)
lst_std_dev_TEST_SET.append(std_dev_test)
            # standard error of the mean across the numTrials runs at this proportion
            lst_std_err_TRAINING_SET.append(std_dev_training / np.sqrt(numTrials))
            lst_std_err_TEST_SET.append(std_dev_test / np.sqrt(numTrials))
###############################################################################################
    # done with outer loop
###############################################################################################
if continue_modeling:
        # calculate the score ratio as a way to quantitatively assess overfitting
        lst_score_ratio = np.array(lst_eval_scores_TRAINING_SET) / np.array(lst_eval_scores_TEST_SET)
        # we use a paired t-test because the two score lists are paired by training proportion
t_statistic, p_value = st.ttest_rel(lst_eval_scores_TRAINING_SET, lst_eval_scores_TEST_SET)
disposition = 'fail to reject null hypothesis; observed difference between 2 samples are not statistically significant'
if p_value < alpha:
            disposition = 'reject null hypothesis; observed difference between 2 samples are statistically significant'
# plot learning curves
if show_learning_curve:
plt.figure(figsize=(10, 6))
plt.plot(lst_training_proportion,lst_eval_scores_TEST_SET, label='Test Set', marker='o')
plt.plot(lst_training_proportion,lst_eval_scores_TRAINING_SET, label='Training Set', marker='x')
plt.xlabel('Training Size')
plt.ylabel('Evaluation Score')
plt.title('Learning Curve for model ' + str(model_name))
plt.legend()
plt.grid(True)
plt.show();
# notify
print(f"\nTraining complete for model {model_name}!")
try:
        # round values to 4 decimal places for display
rounded_training_set = [round(score, 4) for score in lst_eval_scores_TRAINING_SET]
rounded_test_set = [round(score, 4) for score in lst_eval_scores_TEST_SET]
rounded_std_dev_train_set = [round(score, 4) for score in lst_std_dev_TRAINING_SET]
rounded_std_err_train_set = [round(score, 4) for score in lst_std_err_TRAINING_SET]
rounded_std_dev_test_set = [round(score, 4) for score in lst_std_dev_TEST_SET]
rounded_std_err_test_set = [round(score, 4) for score in lst_std_err_TEST_SET]
rounded_score_ratio = [round(score, 4) for score in lst_score_ratio]
rounded_p_value = round(p_value, 4)
# Print rounded lists
print("Training set:", rounded_training_set)
print("Test set: ", rounded_test_set)
print('------------------------------------')
print("Train stddev:", rounded_std_dev_train_set)
print("Train stderr:", rounded_std_err_train_set)
print("Test stddev:", rounded_std_dev_test_set)
print("Test stderr:", rounded_std_err_test_set)
print('------------------------------------')
print("Score Ratio: ", rounded_score_ratio)
print("Trn/Tst p-value:", rounded_p_value)
print('p-value disposition:', disposition)
print('=========================================================================================================================================\n')
except:
print("Training set:", lst_eval_scores_TRAINING_SET)
print("Test set: ", lst_eval_scores_TEST_SET)
print('------------------------------------')
print("Train stddev:", lst_std_dev_TRAINING_SET)
print("Train stderr:", lst_std_err_TRAINING_SET)
print("Test stddev:", lst_std_dev_TEST_SET)
print("Test stderr:", lst_std_err_TEST_SET)
print('------------------------------------')
print("Score Ratio: ", lst_score_ratio)
print("Trn/Tst p-value:", p_value)
print('p-value disposition:', disposition)
print('=========================================================================================================================================\n')
return lst_training_proportion, lst_eval_scores_TRAINING_SET, lst_std_dev_TRAINING_SET, lst_std_err_TRAINING_SET, lst_eval_scores_TEST_SET, lst_std_dev_TEST_SET, lst_std_err_TEST_SET, lst_score_ratio, p_value, disposition
#lst_training_proportion, lst_eval_scores_TRAINING_SET, lst_std_dev_TRAINING_SET, lst_std_err_TRAINING_SET, lst_eval_scores_TEST_SET, lst_std_dev_TEST_SET, lst_std_err_TEST_SET, lst_score_ratio, p_value, disposition = model_learning_curve('logistic_regression', X, y, 5, 0.1, 0.05, True)
Phase 3: Model & Classify the Data
Create target variable label and X features¶
y = df_iris.class_label
X = df_iris[['sepal_length','sepal_width','petal_length','petal_width']]
y_class_names = df_iris['category']
y = y_class_names
y_label_index = iris.target
lst_unique_class_names = list(iris.target_names)
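If a model required integer labels rather than the string categories, the already-imported LabelEncoder could derive them; a small sketch (the sections below keep the string labels):

# optional: integer labels from the string categories via LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(y_class_names)  # array of 0/1/2
print(list(le.classes_))                     # class names in encoded order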
View Matrix X and array (list) y values¶
# X is the matrix representing our entire features and instances
X
| sepal_length | sepal_width | petal_length | petal_width | |
|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 |
| ... | ... | ... | ... | ... |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 |
150 rows × 4 columns
# y is the array representing our label. The index of y corresponds to the instance in X.
# e.g. the instance at index 1 is labeled to be an Iris-setosa flower
y
0 Iris-setosa
1 Iris-setosa
2 Iris-setosa
3 Iris-setosa
4 Iris-setosa
...
145 Iris-virginica
146 Iris-virginica
147 Iris-virginica
148 Iris-virginica
149 Iris-virginica
Name: category, Length: 150, dtype: object
Create Hold-out set¶
# bifurcate data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20,train_size=0.8)
# NOTE 1: we could just say test_size=0.2 and leave off train_size, so that the remaining 80% is used for training
# NOTE 2: we could also state test_size=0.20, train_size=0.50, so that 50% of the data is randomly assigned to training and the remaining 30% is excluded (possibly for a validation set)
# we now have dataframes (X_train, X_test) and series (y_train, y_test) holding the label for each instance in the dataframes
# refer to https://scikit-learn.org/stable/api/ for model types, categories, hyperparameters, and configurations to implement additional models
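The iris classes are perfectly balanced, so a plain shuffled split works well here; for imbalanced data, a stratified and reproducible split is the safer default. A sketch of that variant (commented out so the split above stands):

# stratified, reproducible variant of the split above
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=42)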
determine datatypes¶
print(f'X_train is of type: {type(X_train)}')
print(f'y_train is of type: {type(y_train)}\n')
print(f'X_test is of type: {type(X_test)}')
print(f'y_test is of type: {type(y_test)}')
X_train is of type: <class 'pandas.core.frame.DataFrame'>
y_train is of type: <class 'pandas.core.series.Series'>

X_test is of type: <class 'pandas.core.frame.DataFrame'>
y_test is of type: <class 'pandas.core.series.Series'>
Let's begin modeling!¶
The general format is similar to the following:
- [INSTANTIATION] instantiate model, set hyperparameters (if applicable)
- [TRAINING TIME] fit the model to the training set (i.e., X_train) - this is where learning happens such that weights/hyperplanes/etc are trained to the dataset
- [INFERENCE TIME] once we train the model, we move to inference such that the model is used to predict instances in the test set, i.e., X_test
- [EVALUATION] Evaluate performance of the trained model against the test set (to determine generalization), commonly by measuring the loss of the trained model against the ground truth (labels). Multiple evaluation techniques exist depending on the model representation:
- Confusion Matrix
- Accuracy, Recall, Precision, F_n Score (e.g., F1-Score)
- AUC (Area Under the Curve) and ROC (Receiver Operating Characteristic)
- Negative Predictive Value, etc.
- [INSPECT] Inspect the model performance e.g.:
- Classification Report
- View Feature Importance, etc
- [SIMPLIFY] Simplify the model or retrain if needed
- [DEPLOY] Deploy the model if satisfied with the generality of the model (a compact sketch of this cycle follows the list)
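As a compact illustration of the instantiate/fit/predict/evaluate cycle, here is a minimal sketch that wraps feature scaling and a classifier in a scikit-learn Pipeline (a sketch of the workflow, not the tuned procedure used below):

# minimal end-to-end sketch of the modeling cycle
pipe = Pipeline([('scale', StandardScaler()),
                 ('clf', LogisticRegression(random_state=42))])   # [INSTANTIATION]
pipe.fit(X_train, y_train)                                        # [TRAINING TIME]
y_hat = pipe.predict(X_test)                                      # [INFERENCE TIME]
print(f'pipeline accuracy: {accuracy_score(y_test, y_hat):.4f}')  # [EVALUATION]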
Logistic Regression¶
instantiate model¶
# set the random_state hyperparameter for reproducibility
model_LR = LogisticRegression(random_state=42)
view hyperparameters to this model¶
print(model_LR.get_params())
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': 42, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
[TRAINING] train model
model_LR.fit(X_train, y_train)
LogisticRegression(random_state=42)
[INFERENCE] predict test set
y_pred = model_LR.predict(X_test)
Phase 4: Model Evaluation
Evaluation Predictions¶
# apply trained model to test set to determine how well it might generalize to unseen data in the future
accuracy_LR = accuracy_score(y_test, y_pred) # can also do accuracy_LR = model_LR.score(X_test, y_test)
precision_LR = precision_score(y_test, y_pred, average='weighted')
recall_LR = recall_score(y_test, y_pred, average='weighted')
f1_score_LR = f1_score(y_test, y_pred, average='weighted')
print(f'Accuracy: {accuracy_LR:.4f}')
print(f'Precision: {precision_LR:.4f}')
print(f'Recall: {recall_LR:.4f}')
print(f'F1-Score: {f1_score_LR:.4f}')
Accuracy: 0.9333
Precision: 0.9333
Recall: 0.9333
F1-Score: 0.9333
View Confusion Matrix, Evaluation Prediction Performance, Create Classification Report¶
# make directory
os.makedirs('./classification_report', exist_ok=True)
mdl = model_LR
mdl_name = 'Logistic_Regression'
confusion_mtx_LR = confusion_matrix(y_test, y_pred)
try:
# convert classification report to dictionary to then convert to df
class_report_dict_LR = classification_report(y_test, y_pred, output_dict=True) # This might raise an error
# Convert dictionary to df
df_class_report_LR = pd.DataFrame(class_report_dict_LR).transpose()
# write to disk!
df_class_report_LR.to_csv('classification_report/classification_report_' + mdl_name + '.csv', index=True)
except:
print('Exception caught in classification report on model:', mdl_name)
confusion_mtx_LR
array([[ 8, 0, 0],
[ 0, 13, 1],
[ 0, 1, 7]])
display_confusion_matrix(mdl_name, confusion_mtx_LR, lst_unique_class_names)
Summarize prediction¶
pd.DataFrame(y_pred).value_counts()
0
Iris-versicolor    14
Iris-setosa         8
Iris-virginica      8
Name: count, dtype: int64
Summarize Ground-Truth labels¶
pd.DataFrame(y_test).value_counts()
category
Iris-versicolor    14
Iris-setosa         8
Iris-virginica      8
Name: count, dtype: int64
View Feature Importance to this model¶
feature_names = X.columns
importances = np.abs(model_LR.coef_[0]) # absolute coefficients; note coef_[0] covers only the first class (a multiclass fit has one coefficient row per class)
df_importances = pd.DataFrame({'Feature': feature_names,'Importance': importances}).sort_values(by='Importance', ascending=True)
# Plot
plt.figure(figsize=(10, 6))
plt.barh(df_importances['Feature'], df_importances['Importance'], edgecolor='black')
plt.xlabel('Importance')
plt.title('Feature Importance from Logistic Regression model')
plt.show()
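Coefficient magnitudes mix feature scale with importance when the inputs are not standardized; the already-imported permutation_importance provides a model-agnostic cross-check. A sketch on the held-out test set:

# model-agnostic cross-check: permutation importance on the test set
perm = permutation_importance(model_LR, X_test, y_test, n_repeats=10, random_state=42)
for name, imp in sorted(zip(feature_names, perm.importances_mean), key=lambda t: -t[1]):
    print(f'{name}: {imp:.4f}')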
View Classification Report¶
df_class_report_LR
| precision | recall | f1-score | support | |
|---|---|---|---|---|
| Iris-setosa | 1.000000 | 1.000000 | 1.000000 | 8.000000 |
| Iris-versicolor | 0.928571 | 0.928571 | 0.928571 | 14.000000 |
| Iris-virginica | 0.875000 | 0.875000 | 0.875000 | 8.000000 |
| accuracy | 0.933333 | 0.933333 | 0.933333 | 0.933333 |
| macro avg | 0.934524 | 0.934524 | 0.934524 | 30.000000 |
| weighted avg | 0.933333 | 0.933333 | 0.933333 | 30.000000 |
NOTE: support indicates the number of ground-truth instances belonging to this class
Evaluate Cross Validation¶
#cv_score_accuracy_LR = cross_val_score(mdl, X_train, y_train, cv=10, scoring='accuracy', n_jobs=-1)
cv_score_accuracy_LR = cross_val_score(model_LR, X, y, cv=10, scoring='accuracy', n_jobs=-1)
cv_score_precision_LR = cross_val_score(model_LR, X, y, cv=10, scoring='precision_macro', n_jobs=-1)
cv_score_recall_LR = cross_val_score(model_LR, X, y, cv=10, scoring='recall_macro', n_jobs=-1)
cv_score_f1_score_LR = cross_val_score(model_LR, X, y, cv=10, scoring='f1_macro', n_jobs=-1)
print('cross-validation evaluation complete\n')
# note: the *_macro scorers average each metric over labels equally; the *_weighted variants account for label imbalance by weighting each label by its support (the number of true instances per label)
print(f'Accuracy: {cv_score_accuracy_LR.mean():.4f}')
print(f'Precision {cv_score_precision_LR.mean():.4f}')
print(f'Recall: {cv_score_recall_LR.mean():.4f}')
print(f'F1- Score: {cv_score_f1_score_LR.mean():.4f}')
cross-validation evaluation complete

Accuracy: 0.9733
Precision 0.9778
Recall: 0.9733
F1- Score: 0.9731
Investigate if Model is overfitting¶
We will do this by comparing the learning curve on the training data against the learning curve on the test data
# set learning curves to compute at varying amounts of training data
#lst_training_size = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80]
lst_training_size = [0.05, 0.10, 0.20, 0.40, 0.80]
# set test size
testSize = 0.1
# set num trials
numTrials = 50
# create lists to store eval scores for the training and test sets
lst_eval_scores_TRAINING_SET = []
lst_eval_scores_TEST_SET = []
# outer loop: training size
for i in range(len(lst_training_size)):
# set training size percentage
trainSize = lst_training_size[i]
# init eval scores at this percentage (index)
total_eval_score_TRAINING = 0
total_eval_score_TEST = 0
    # iterate num trials, store the average eval score
for j in range(numTrials):
# establish training/testing hold-out
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=testSize,train_size=trainSize,shuffle=True)
        # initialize model
model_LogReg = LogisticRegression() # use default parameters
# train the model to learn the training set
model_LogReg.fit(X_train, y_train)
# use score helper function to apply trained model to the test set and return accuracy. then accumulate model eval performance
eval_score_TEST = model_LogReg.score(X_test,y_test)
total_eval_score_TEST = total_eval_score_TEST + eval_score_TEST
eval_score_TRAINING = model_LogReg.score(X_train,y_train)
total_eval_score_TRAINING = total_eval_score_TRAINING + eval_score_TRAINING
# done with num trials at this training size, store average evaluation score
lst_eval_scores_TEST_SET.append(total_eval_score_TEST/numTrials)
lst_eval_scores_TRAINING_SET.append(total_eval_score_TRAINING/numTrials)
# notify
print("Training complete!")
print("Training set:", lst_eval_scores_TRAINING_SET)
print("Test set: ", lst_eval_scores_TEST_SET)
# plot learning curves
plt.figure(figsize=(10, 6))
plt.plot(lst_training_size,lst_eval_scores_TEST_SET, label='Test Set', marker='o')
plt.plot(lst_training_size,lst_eval_scores_TRAINING_SET, label='Training Set', marker='x')
plt.xlabel('Training Size')
plt.ylabel('Evaluation Score')
plt.title('Learning Curve')
plt.legend()
plt.grid(True)
plt.show();
Training complete!
Training set: [0.9428571428571425, 0.9519999999999996, 0.9826666666666669, 0.9730000000000003, 0.9743333333333339]
Test set:     [0.7826666666666666, 0.8959999999999997, 0.9373333333333328, 0.9559999999999996, 0.9666666666666663]
Compare if score from training set and test set are statistically different¶
# we use a paired t-test because the two score lists are paired by training proportion
alpha = 0.05
t_statistic, p_value = st.ttest_rel(lst_eval_scores_TRAINING_SET, lst_eval_scores_TEST_SET)
print(f'p-value is {p_value:.4f}')
if p_value < 0.05:
print('Conclusion: We reject null hypothesis and conclude the difference between these 2 sets are statistically significant.\n\n')
else:
print('Conclusion: We fail to reject the null hypothesis since we could not observe a statistically significant difference between the 2 sets.\n\n')
p-value is 0.1033

Conclusion: We fail to reject the null hypothesis since we could not observe a statistically significant difference between the 2 sets.
With an alpha of 0.05, we do not have enough evidence to reject the null hypothesis.
The null hypothesis states there is no observable difference between the two sets (in other words, any observed differences are due to random noise rather than a specific intervention).
Here, p_value > 0.05, so the two score sets are quite similar. The p-value alone does not diagnose overfitting versus underfitting, but before concluding either, we would expect the difference between the two sets to be statistically significant.
If the difference were statistically significant, we could then compare the training curve against the test curve to characterize over- or underfitting.
Calculate Learning Curve Score Ratio¶
score_ratio = np.array(lst_eval_scores_TRAINING_SET) / np.array(lst_eval_scores_TEST_SET)
print(f'Ratio set: {score_ratio}')
print(f'Ratio average: {score_ratio.mean()}')
Ratio set: [1.20467267 1.0625 1.04836415 1.01778243 1.00793103] Ratio average: 1.068250056927529
There are multiple ways to evaluate the values in our learning curve. We will use the score ratio to determine how the model is performing (a small helper implementing these thresholds follows the list):
- Ratio Close to 1: If the ratio of training score to test score is close to 1 (e.g., between 0.95 and 1.05), it suggests that the model's performance on training and test sets is similar.
- Moderate Ratio: Ratios between 1.05 and 1.20 might indicate moderate overfitting.
- High Ratio: Ratios greater than 1.20 suggest significant overfitting, as the model performs much better on the training set than on the test set.
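These thresholds are heuristics rather than a standard; wrapped in a small helper for reuse:

# heuristic disposition from the mean train/test score ratio (thresholds from the list above)
def fit_disposition(ratio):
    if ratio <= 1.05:
        return 'train and test scores are similar (little evidence of overfitting)'
    if ratio <= 1.20:
        return 'moderate overfitting'
    return 'significant overfitting'

print(fit_disposition(score_ratio.mean()))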
Train the Models to Graph the Learning Curve of each Model's Test Set¶
%%time
# re-init
X = df_iris[['sepal_length','sepal_width','petal_length','petal_width']]
y = df_iris.class_label
num_trials = 50
# logistic regression
lst_training_proportion_LOGISTIC_REGRESSION, lst_eval_scores_TRAINING_SET_LOGISTIC_REGRESSION, lst_std_dev_TRAINING_SET_LOGISTIC_REGRESSION, lst_std_err_TRAINING_SET_LOGISTIC_REGRESSION, lst_eval_scores_TEST_SET_LOGISTIC_REGRESSION, lst_std_dev_TEST_SET_LOGISTIC_REGRESSION, lst_std_err_TEST_SET_LOGISTIC_REGRESSION, lst_score_ratio_LOGISTIC_REGRESSION, p_value_LOGISTIC_REGRESSION, disposition_LOGISTIC_REGRESSION = model_learning_curve('logistic_regression', X, y, num_trials, 0.1, 0.05, False)
# nbc
lst_training_proportion_NBC, lst_eval_scores_TRAINING_SET_NBC, lst_std_dev_TRAINING_SET_NBC, lst_std_err_TRAINING_SET_NBC, lst_eval_scores_TEST_SET_NBC, lst_std_dev_TEST_SET_NBC, lst_std_err_TEST_SET_NBC, lst_score_ratio_NBC, p_value_NBC, disposition_NBC = model_learning_curve('nbc_gaussian', X, y, num_trials, 0.1, 0.05, False)
# svm
lst_training_proportion_SVC, lst_eval_scores_TRAINING_SET_SVC, lst_std_dev_TRAINING_SET_SVC, lst_std_err_TRAINING_SET_SVC, lst_eval_scores_TEST_SET_SVC, lst_std_dev_TEST_SET_SVC, lst_std_err_TEST_SET_SVC, lst_score_ratio_SVC, p_value_SVC, disposition_SVC = model_learning_curve('svc', X, y, num_trials, 0.1, 0.05, False)
# perceptron
lst_training_proportion_PERCEPTRON, lst_eval_scores_TRAINING_SET_PERCEPTRON, lst_std_dev_TRAINING_SET_PERCEPTRON, lst_std_err_TRAINING_SET_PERCEPTRON, lst_eval_scores_TEST_SET_PERCEPTRON, lst_std_dev_TEST_SET_PERCEPTRON, lst_std_err_TEST_SET_PERCEPTRON, lst_score_ratio_PERCEPTRON, p_value_PERCEPTRON, disposition_PERCEPTRON = model_learning_curve('perceptron', X, y, num_trials, 0.1, 0.05, False)
# mlp
lst_training_proportion_MLP, lst_eval_scores_TRAINING_SET_MLP, lst_std_dev_TRAINING_SET_MLP, lst_std_err_TRAINING_SET_MLP, lst_eval_scores_TEST_SET_MLP, lst_std_dev_TEST_SET_MLP, lst_std_err_TEST_SET_MLP, lst_score_ratio_MLP, p_value_MLP, disposition_MLP = model_learning_curve('mlp', X, y, num_trials, 0.1, 0.05, False)
# decision tree
lst_training_proportion_DECISION_TREE, lst_eval_scores_TRAINING_SET_DECISION_TREE, lst_std_dev_TRAINING_SET_DECISION_TREE, lst_std_err_TRAINING_SET_DECISION_TREE, lst_eval_scores_TEST_SET_DECISION_TREE, lst_std_dev_TEST_SET_DECISION_TREE, lst_std_err_TEST_SET_DECISION_TREE, lst_score_ratio_DECISION_TREE, p_value_DECISION_TREE, disposition_DECISION_TREE = model_learning_curve('decision_tree', X, y, num_trials, 0.1, 0.05, False)
print('\nDONE!')
commencing evaluation of model [logistic_regression] ... Training complete for model logistic_regression!
    Training set: [0.9314, 0.964, 0.9713, 0.971, 0.9736, 0.9742]
    Test set:     [0.74, 0.8493, 0.9333, 0.936, 0.956, 0.964]
    Train stddev: [0.0877, 0.0451, 0.0278, 0.0174, 0.0134, 0.0076]   Train stderr: [0.0877, 0.0319, 0.016, 0.0087, 0.006, 0.0031]
    Test stddev:  [0.1566, 0.1188, 0.0774, 0.0631, 0.0497, 0.0409]   Test stderr:  [0.1566, 0.084, 0.0447, 0.0316, 0.0222, 0.0167]
    Score Ratio:  [1.2587, 1.135, 1.0407, 1.0374, 1.0184, 1.0105]
    Trn/Tst p-value: 0.0668 -> fail to reject null hypothesis; observed difference between 2 samples are not statistically significant
=========================================================================================================================================
commencing evaluation of model [nbc_gaussian] ... Training complete for model nbc_gaussian!
    Training set: [0.9971, 0.9867, 0.9667, 0.9633, 0.9616, 0.959]
    Test set:     [0.5827, 0.844, 0.944, 0.932, 0.9587, 0.968]
    Train stddev: [0.0202, 0.0301, 0.0309, 0.0196, 0.0103, 0.0077]   Train stderr: [0.0202, 0.0213, 0.0178, 0.0098, 0.0046, 0.0031]
    Test stddev:  [0.1974, 0.1344, 0.0622, 0.0639, 0.0444, 0.0452]   Test stderr:  [0.1974, 0.0951, 0.0359, 0.0319, 0.0199, 0.0184]
    Score Ratio:  [1.7113, 1.169, 1.024, 1.0336, 1.003, 0.9907]
    Trn/Tst p-value: 0.1899 -> fail to reject null hypothesis; observed difference between 2 samples are not statistically significant
=========================================================================================================================================
commencing evaluation of model [svc] ... Training complete for model svc!
    Training set: [0.9743, 0.9827, 0.9773, 0.9793, 0.9849, 0.9878]
    Test set:     [0.8187, 0.936, 0.96, 0.9667, 0.9773, 0.968]
    Train stddev: [0.0554, 0.0325, 0.0265, 0.0156, 0.0086, 0.007]   Train stderr: [0.0554, 0.023, 0.0153, 0.0078, 0.0039, 0.0029]
    Test stddev:  [0.1451, 0.0785, 0.0467, 0.0431, 0.0319, 0.0431]   Test stderr:  [0.1451, 0.0555, 0.0269, 0.0216, 0.0143, 0.0176]
    Score Ratio:  [1.1901, 1.0499, 1.0181, 1.0131, 1.0077, 1.0205]
    Trn/Tst p-value: 0.1204 -> fail to reject null hypothesis; observed difference between 2 samples are not statistically significant
=========================================================================================================================================
commencing evaluation of model [perceptron] ... Training complete for model perceptron!
    Training set: [0.7314, 0.684, 0.708, 0.7397, 0.7267, 0.7505]
    Test set:     [0.5653, 0.628, 0.6253, 0.7453, 0.736, 0.772]
    Train stddev: [0.2094, 0.1687, 0.1526, 0.1304, 0.1449, 0.1195]   Train stderr: [0.2094, 0.1193, 0.0881, 0.0652, 0.0648, 0.0488]
    Test stddev:  [0.2017, 0.1661, 0.1914, 0.1626, 0.1761, 0.1611]   Test stderr:  [0.2017, 0.1175, 0.1105, 0.0813, 0.0788, 0.0658]
    Score Ratio:  [1.2938, 1.0892, 1.1322, 0.9924, 0.9873, 0.9722]
    Trn/Tst p-value: 0.1903 -> fail to reject null hypothesis; observed difference between 2 samples are not statistically significant
=========================================================================================================================================
commencing evaluation of model [mlp] ... Training complete for model mlp!
    Training set: [0.9943, 0.9947, 0.9907, 0.9853, 0.9858, 0.9797]
    Test set:     [0.876, 0.9347, 0.9733, 0.9707, 0.9693, 0.9787]
    Train stddev: [0.0404, 0.0183, 0.0166, 0.0141, 0.009, 0.0072]   Train stderr: [0.0404, 0.0129, 0.0096, 0.0071, 0.004, 0.0029]
    Test stddev:  [0.1611, 0.0928, 0.0356, 0.0408, 0.0386, 0.0367]   Test stderr:  [0.1611, 0.0656, 0.0206, 0.0204, 0.0173, 0.015]
    Score Ratio:  [1.135, 1.0642, 1.0178, 1.0151, 1.017, 1.001]
    Trn/Tst p-value: 0.089 -> fail to reject null hypothesis; observed difference between 2 samples are not statistically significant
=========================================================================================================================================
commencing evaluation of model [decision_tree] ... Training complete for model decision_tree!
    Training set: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
    Test set:     [0.796, 0.9147, 0.9427, 0.9413, 0.9493, 0.9453]
    Train stddev: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]   Train stderr: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    Test stddev:  [0.1711, 0.0738, 0.0587, 0.0627, 0.0477, 0.0498]   Test stderr:  [0.1711, 0.0522, 0.0339, 0.0313, 0.0213, 0.0203]
    Score Ratio:  [1.2563, 1.0933, 1.0608, 1.0623, 1.0534, 1.0578]
    Trn/Tst p-value: 0.0172 -> reject null hypothesis; observed difference between 2 samples are statistically significant
=========================================================================================================================================
DONE!
CPU times: user 1min 17s, sys: 16.3 ms, total: 1min 17s
Wall time: 1min 31s
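As a rough cross-check on the p-value dispositions above, the train-vs-test comparison can be approximated with a two-sample t-test. This is a minimal sketch only: it assumes model_learning_curve compares the two score samples at the alpha = 0.05 passed in above, and it may not reproduce the notebook's exact test statistic.
# hedged sketch: approximate the train-vs-test significance check with a two-sample t-test
# ASSUMPTION: model_learning_curve compares the two score samples at alpha = 0.05 (argument above)
from scipy import stats

train_scores = [0.9314, 0.964, 0.9713, 0.971, 0.9736, 0.9742]  # logistic_regression training scores above
test_scores = [0.74, 0.8493, 0.9333, 0.936, 0.956, 0.964]      # logistic_regression test scores above
alpha = 0.05
t_stat, p_val = stats.ttest_ind(train_scores, test_scores)
if p_val < alpha:
    print(f'p-value {p_val:.4f} < {alpha}: reject null hypothesis; difference is statistically significant')
else:
    print(f'p-value {p_val:.4f} >= {alpha}: fail to reject null hypothesis; difference is not statistically significant')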
Graph the data¶
# Plotting the learning curve
plt.figure(figsize=(15, 5))
# the training-size proportions are identical across models, so the logistic regression x-values are reused for every curve
plt.errorbar(lst_training_proportion_LOGISTIC_REGRESSION, lst_eval_scores_TEST_SET_LOGISTIC_REGRESSION, yerr=lst_std_err_TEST_SET_LOGISTIC_REGRESSION, fmt='-', capsize=3, marker='o', label="Logistic Regression")
plt.errorbar(lst_training_proportion_LOGISTIC_REGRESSION, lst_eval_scores_TEST_SET_NBC, yerr=lst_std_err_TEST_SET_NBC, fmt='-', capsize=3, marker='X', label="NBC")
plt.errorbar(lst_training_proportion_LOGISTIC_REGRESSION, lst_eval_scores_TEST_SET_SVC, yerr=lst_std_err_TEST_SET_SVC, fmt='-', capsize=3, marker='>', label="SVC")
plt.errorbar(lst_training_proportion_LOGISTIC_REGRESSION, lst_eval_scores_TEST_SET_PERCEPTRON, yerr=lst_std_err_TEST_SET_PERCEPTRON, fmt='-', capsize=3, marker='<', label="Perceptron")
# Fill the area between the upper and lower bounds of the error bars
plt.fill_between(lst_training_proportion_LOGISTIC_REGRESSION, np.array(lst_eval_scores_TEST_SET_PERCEPTRON) - np.array(lst_std_err_TEST_SET_PERCEPTRON),
np.array(lst_eval_scores_TEST_SET_PERCEPTRON) + np.array(lst_std_err_TEST_SET_PERCEPTRON),
color='red', alpha=0.2, label="Perceptron Model Std Error Area")
plt.xlabel('Training Size Proportion')
plt.ylabel('Average Accuracy')
plt.title('Learning Curve for Test Set' )
#plt.legend(['NBC Test Set', 'Perceptron Test Set', 'Decision Tree Test Set'])
plt.legend(title="Model")
plt.grid(True)
plt.show()
Phase 5: Deploy Model
i = 0
instance = X_test.iloc[[i]]  # double brackets keep a one-row DataFrame, which predict() expects
prediction = model_LR.predict(instance)
ground_truth = y_test.iloc[i]
prediction_correct = (prediction == ground_truth)[0]
print('instance index:', i, ' prediction:', prediction, ' ground truth:', ground_truth, ' prediction correct:', prediction_correct)
instance index: 0 prediction: ['Iris-setosa'] ground truth: Iris-setosa prediction correct: True
Phase 3 UPDATE: Modeling and Classification Using Standardization
wrap learning and evaluation in a function so we can streamline the evaluation of multiple models¶
standardize the data¶
# we typically standardize our data so that the values in each column are rescaled to a mean of 0 and a standard deviation of 1; each value's z-score then expresses how many standard deviations it sits from the column mean
# standardization helps learning converge and prevents any one feature's scale from dominating the learning process
# recall, the distribution estimate of our values is the following
df_iris.drop('class_label', axis=1).plot(kind='kde');
observation: notice this data is not Z-Score standardized
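Before scaling, it helps to see what z-score standardization actually computes; this is a minimal sketch (assuming X_train from the earlier split holds the four numeric features) showing that the hand-computed z-score matches StandardScaler.
# hedged sketch: z-score by hand vs. StandardScaler
# z = (x - mean) / std; StandardScaler uses the population std (ddof=0)
z_manual = (X_train - X_train.mean()) / X_train.std(ddof=0)
z_sklearn = StandardScaler().fit_transform(X_train)
print(np.allclose(z_manual.values, z_sklearn))  # expect True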
# instantiate scaler
scaler = StandardScaler()
# fit the scaler on the training data only, then apply the same training-derived transform to the test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)  # transform, not fit_transform: re-fitting on the test set leaks test statistics into the scaling
df_scaled = pd.DataFrame(X_train_scaled, columns=lst_attributes)
df_scaled.plot(kind='kde');
observation: although not perfect, the density estimates are now better aligned at mean = 0
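A safer pattern than calling the scaler by hand is to chain the scaler and model in a Pipeline (imported at the top), so the scaler is always fit on training data only; a minimal sketch, assuming X_train/X_test/y_train/y_test from the earlier split.
# hedged sketch: Pipeline guarantees the scaler is fit on training folds only,
# so cross-validation and prediction cannot leak test-set statistics
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(random_state=42))
])
pipe.fit(X_train, y_train)
print('pipeline test accuracy:', pipe.score(X_test, y_test))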
sns.kdeplot(data=df_scaled, fill=True);
df_scaled.plot(kind="box");
sns.boxplot(data=df_scaled);
sns.swarmplot(data=df_scaled, color='black');
plt.show()
# Create a 2x2 subplot layout
df_scaled_copy = df_scaled.copy()
# NOTE: df_scaled carries a fresh 0..n-1 index, so this assignment aligns with the first rows of df_iris rather than the shuffled training rows; y_train.reset_index(drop=True) would give the exact training labels
df_scaled_copy['category'] = df_iris['category']
# set data to plot
df_to_plot = df_scaled_copy.copy()
fig, axes = plt.subplots(2, 2, figsize=(14, 14))
axes = axes.flatten()
# Plot boxplot and swarmplot for each feature
for i, feature in enumerate(lst_attributes):
sns.boxplot(ax=axes[i], data=df_to_plot, x='category', y=feature)
sns.swarmplot(ax=axes[i], data=df_to_plot, x='category', y=feature)
#axes[i].set_title(f'{feature.capitalize()} by category')
axes[i].set_title(f'{feature} by category')
axes[i].set_xlabel('category')
axes[i].set_ylabel(feature)
plt.subplots_adjust(wspace=0.3, hspace=0.4)
#plt.tight_layout()
plt.show()
# Set the figure size
plt.figure(figsize=(15, 15))
# set data to plot
df_to_plot = df_scaled_copy.copy()
# List of dependent variables to plot
variables = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
# Loop through each subplot
for i, feature in enumerate(variables):
plt.subplot(2,2,i+1)
sns.violinplot(x='category', y=feature, data=df_to_plot)
plt.title(feature)
# Adjust layout to prevent overlap
#plt.tight_layout()
plt.subplots_adjust(wspace=0.3, hspace=0.2)
plt.show()
combine subplots for boxplot and violin plots¶
# Set the figure size
plt.figure(figsize=(15, 15))
# set data to plot
df_to_plot = df_scaled_copy.copy()
# List of dependent variables to plot
variables = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
# Loop through each subplot
for i, feature in enumerate(variables):
plt.subplot(2,2,i+1)
sns.violinplot(x='category', y=feature, data=df_to_plot)
#sns.boxplot(data=df_to_plot, x='category', y=feature)
sns.swarmplot(data=df_to_plot, x='category', y=feature)
plt.title(feature)
# Adjust layout to prevent overlap
#plt.tight_layout()
plt.subplots_adjust(wspace=0.3, hspace=0.2)
plt.show()
# set data to plot
df_to_plot = df_scaled_copy.copy()
# Set up the figure and axes
n_features = len(lst_attributes)
fig, axes = plt.subplots(n_features, 2, figsize=(13, 5 * n_features))
# Loop through each feature and plot the boxplot, swarmplot, and violin plot
for i, feature in enumerate(lst_attributes):
# Plot boxplot and swarmplot
sns.boxplot(ax=axes[i, 0], data=df_to_plot, x='category', y=feature)
sns.swarmplot(ax=axes[i, 0], data=df_to_plot, x='category', y=feature)
axes[i, 0].set_title(f'{feature} by category')
axes[i, 0].set_xlabel('category')
axes[i, 0].set_ylabel(feature)
# Plot violin plot
sns.violinplot(ax=axes[i, 1], x='category', y=feature, data=df_to_plot)
sns.swarmplot(ax=axes[i, 1], data=df_to_plot, x='category', y=feature)
axes[i, 1].set_title(f'{feature} by category')
axes[i, 1].set_xlabel('category')
axes[i, 1].set_ylabel(feature)
# Adjust layout to prevent overlap
plt.tight_layout()
plt.show()
HELPER FUNCTIONS ¶
Get Time¶
def get_time():
formatted_time = ""
try:
now = datetime.datetime.now()
year = now.year
month = now.month
day = now.day
hour = now.hour
        minute = now.minute  # avoid shadowing the built-in min()
        formatted_time = "{}-{:02}-{:02}-{:02}{:02}".format(year, month, day, hour, minute)
except Exception as error:
print_exception(error, inspect.currentframe().f_code.co_name, False)
return formatted_time
Calculate Time Duration¶
def calculate_time_duration(datetime_duration):
formatted_time = ""
try:
days = datetime_duration.days
hours = datetime_duration.seconds // 3600
minutes = (datetime_duration.seconds % 3600) // 60
# seconds = str(divmod(datetime_duration.seconds, 1))
seconds = datetime_duration.seconds % 60
formatted_time = f"{days} day(s), {hours} hour(s), {minutes} minute(s), {seconds} second(s)"
except Exception as error:
print_exception(error, inspect.currentframe().f_code.co_name, False)
return formatted_time
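A quick usage sketch for the two time helpers above:
# hedged usage sketch for get_time() and calculate_time_duration()
start = datetime.datetime.now()
print('timestamp:', get_time())                       # e.g. '2025-02-07-2038'
duration = datetime.datetime.now() - start            # a datetime.timedelta
print('elapsed:', calculate_time_duration(duration))  # e.g. '0 day(s), 0 hour(s), 0 minute(s), 0 second(s)'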
Display Sum NaN Function¶
def display_NaN_columns(dataframe):
try:
lst_nan_sum = dataframe.isna().sum()
filtered_cols_with_na_sum_greater_than_zero = lst_nan_sum[lst_nan_sum > 0]
if len(filtered_cols_with_na_sum_greater_than_zero) < 1:
print("NO COLS WITH NaN!!!!!!!!!!!!")
else:
print(f'Columns with NaN: {len(filtered_cols_with_na_sum_greater_than_zero)}')
print(filtered_cols_with_na_sum_greater_than_zero)
except Exception as error:
print_exception(error, inspect.currentframe().f_code.co_name, False)
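Usage sketch (assuming df_iris from above; a throwaway copy with one injected NaN exercises both branches):
# hedged usage sketch for display_NaN_columns()
display_NaN_columns(df_iris)   # expect "NO COLS WITH NaN" on the cleaned dataset
df_demo = df_iris.copy()
df_demo.iloc[0, 0] = np.nan    # inject a single NaN
display_NaN_columns(df_demo)   # now reports one column with a NaN count of 1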
Check Unique Cols¶
def print_nunique(SERIES, num_columns):
    # iterate through the Series, printing num_columns entries per row
    if num_columns < 1:
        num_columns = 1
    try:
        count = 0
        for col, val in SERIES.items():
            print(f'{col}: {val}', end='\t\t\t\t')
count += 1
if count % num_columns == 0:
print('')
except Exception as error:
print_exception(error, inspect.currentframe().f_code.co_name, False)
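Usage sketch: df_iris.nunique() returns exactly the Series this helper expects.
# hedged usage sketch for print_nunique(); prints 2 entries per row
print_nunique(df_iris.nunique(), 2)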
Feature Importance¶
# helper lists mapping each model name to the protocol used to extract its feature-importance values
lst_coef_models = ['logistic_regression', 'perceptron', 'sgd_classifier', 'passive_aggressive', 'ridge_classifier', 'linear_svc', 'svc', 'svc_nu']
lst_all_feature_importance = ['bagging_classifier']
lst_permutation_importance = ['nbc_gaussian', 'knn' , 'ada_boosting', 'quadratic_discriminant', 'linear_discriminant', 'hist_boosting', 'gaussian_process', 'mlp', 'nbc_bernoulli']
def display_feature_importance_chart(model, model_name, lst_feature_names, num_coefficients_to_display, figure_save_name_ok_to_be_null):
# not all functions have the same protocol to extract feature importance coefficients
if model_name in lst_coef_models:
ftr_importance = np.abs(model.coef_[0])
elif model_name in lst_all_feature_importance:
        all_feature_importances = np.array([est.feature_importances_ for est in model.estimators_])  # est, not tree: avoid shadowing the sklearn tree module imported above
        # Average the feature importances across all estimators
        mean_feature_importances = np.mean(all_feature_importances, axis=0)
        ftr_importance = mean_feature_importances
    elif model_name in lst_permutation_importance:
        # NOTE: this relies on the global X_test / y_test; pass the same (scaled) features the model was trained on, or the importances will be meaningless
        results = permutation_importance(model, X_test, y_test, scoring='accuracy', n_repeats=10, random_state=42)
# Get feature importances
importance = results.importances_mean
ftr_importance = np.abs(importance)
#elif model_name in lst_log_prob:
# ftr_importance = model.feature_log_prob_
else:
ftr_importance = model.feature_importances_
return display_feature_importance(model_name, lst_feature_names, ftr_importance, num_coefficients_to_display, figure_save_name_ok_to_be_null)
#display feature importance
def display_feature_importance(model_name, list_col_names, feature_importances, num_features, figure_save_name_ok_to_be_null):
try:
# make directory
os.makedirs('./feature_importance', exist_ok=True)
os.makedirs('./feature_importance_data', exist_ok=True)
#plot importances
feature_names = list_col_names
importances = feature_importances
indices = np.argsort(importances)
# save coefficients to file
# reverse indices
reverse_indices = indices[::-1]
try:
if figure_save_name_ok_to_be_null is not None and len(figure_save_name_ok_to_be_null) > 0:
# Export to a file
with open(str('./feature_importance_data/' + figure_save_name_ok_to_be_null) + '.csv', 'w') as file:
# write header
file.write(str(model_name) + '_feature,importance_coefficient\n')
# write values
for i in reverse_indices:
file.write(str(list_col_names[i]) + "," + str(importances[i]) + '\n')
        except Exception:
            pass  # file export is best-effort; continue plotting even if the write fails
# reduce the number of values to print to cell
if num_features > 0:
indices = np.argsort(importances)[-num_features:]
figure_height = math.ceil(num_features / 2)
if figure_height < 1:
figure_height = math.ceil(len(feature_importances) / 3)
plt.figure(figsize=(15, figure_height))
plt.title("Feature Importances for model [" + str(model_name) + "]")
plt.barh(range(len(indices)), importances[indices], align="center", edgecolor='black')
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
# ensure all text fits within figure
plt.tight_layout()
try:
if figure_save_name_ok_to_be_null is not None and len(figure_save_name_ok_to_be_null) > 0:
# Save the figure
plt.savefig('./feature_importance/' + str(figure_save_name_ok_to_be_null) + '.png', bbox_inches='tight') # add 'tight' to ensure all text fits inside the figure
        except Exception:
            pass  # figure save is best-effort
#plt.show()
#display list of coefficients
print("coefficients:\n====================")
#count = 0
#for i,v in enumerate(importances):
# print('%s:\t %.5f' % (list_col_names[i],v))
# count += 1
# if num_features > 0 and count > num_features:
# break
# resort and take limit number of expected output features
if num_features > 0:
reverse_indices = indices[::-1]
#print
for i in reverse_indices:
print(str(list_col_names[i]) + "\t" + str(importances[i]))
except Exception as error:
print(error)
print(inspect.currentframe().f_code.co_name)
Instantiate Models
# module aliases and classes used below that are not imported at the top of the notebook
from sklearn import linear_model, svm, ensemble, naive_bayes, gaussian_process
from sklearn.linear_model import RidgeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
RANDOM_STATE = 42
dict_models = {
# linear models
'logistic_regression': LogisticRegression(random_state=RANDOM_STATE),
'perceptron': linear_model.Perceptron(random_state=RANDOM_STATE),
'sgd_classifier': SGDClassifier(random_state=RANDOM_STATE),
'passive_aggressive': linear_model.PassiveAggressiveClassifier(random_state=RANDOM_STATE),
'ridge_classifier': RidgeClassifier(),
# SVM
'linear_svc': svm.LinearSVC(random_state=RANDOM_STATE),
'svc': svm.SVC(probability=True, kernel='linear', random_state=RANDOM_STATE),
'svc_nu': svm.NuSVC(probability=True, kernel='linear', random_state=RANDOM_STATE),
# tree
'decision_tree': DecisionTreeClassifier(random_state=RANDOM_STATE), #random_state=42
# ensemble
'random_forest': RandomForestClassifier(random_state=RANDOM_STATE), # random_state=42
'extra_tree': ExtraTreesClassifier(random_state=RANDOM_STATE),
'bagging_classifier': BaggingClassifier(random_state=RANDOM_STATE),
'gradient_boosting': ensemble.GradientBoostingClassifier(random_state=RANDOM_STATE), # random_state=42
'ada_boosting': ensemble.AdaBoostClassifier(random_state=RANDOM_STATE),
'hist_boosting': HistGradientBoostingClassifier(random_state=RANDOM_STATE),
#'xgb': xgboost.XGBClassifier(),
# Gaussian Processes
'gaussian_process': gaussian_process.GaussianProcessClassifier(random_state=RANDOM_STATE),
# NBC probabilistic
'nbc_gaussian': GaussianNB(),
'nbc_bernoulli': naive_bayes.BernoulliNB(),
#'nbc_multinomial': naive_bayes.MultinomialNB(),
#'nbc_categorical': naive_bayes.CategoricalNB(),
# neighbors
'knn': KNeighborsClassifier(),
#'radius_neighbors': neighbors.RadiusNeighborsClassifier(random_state=RANDOM_STATE),
# discriminant analysis
'quadratic_discriminant': QuadraticDiscriminantAnalysis(),
'linear_discriminant': LinearDiscriminantAnalysis(),
# neural network
'mlp': MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=RANDOM_STATE)
# NOTE: there are many more we can add, this is just a starting point!
}
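The dictionary makes it easy to register more models, as the NOTE above suggests; a minimal sketch adding a soft-voting ensemble (VotingClassifier is imported at the top) built from fresh copies of three estimators already used here. It is left unexecuted so the trained-model count below stays at 22.
# hedged sketch: registering one more model in dict_models
# fresh estimator instances are used so fitting the ensemble does not disturb the entries above
dict_models['voting_soft'] = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(random_state=RANDOM_STATE)),
        ('svc', SVC(probability=True, kernel='linear', random_state=RANDOM_STATE)),
        ('nbc', GaussianNB()),
    ],
    voting='soft')  # 'soft' averages predicted class probabilities, hence probability=True on the SVC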
Classify Dataset!!!¶
# metrics and CV utilities used below that are not imported at the top of the notebook
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score
def classify_dataset(mdl_name, mdl, X_trn, X_tst, y_trn, y_tst, class_lbls):
analysis_start_time_text = "not started"
analysis_end_time_text = "not started"
analysis_start_time = datetime.datetime.now()
analysis_start_time_text = get_time()
# make directory
os.makedirs('./classification_report', exist_ok=True)
# notify
print(f'\ntraining model: {mdl_name}...')
##################################################
# train the model
##################################################
mdl.fit(X_trn, y_trn)
##################################################
# predict test set
##################################################
y_prd = mdl.predict(X_tst)
##################################################
# evaluate model
##################################################
# apply trained model to test set to determine how well it might generalize to unseen data in the future
accuracy_mdl = accuracy_score(y_tst, y_prd) # can also do accuracy_mdl = mdl.score(X_test, y_tst)
precision_mdl = precision_score(y_tst, y_prd, average='weighted')
recall_mdl = recall_score(y_tst, y_prd, average='weighted')
f1_score_mdl = f1_score(y_tst, y_prd, average='weighted')
    # NOTE: indexing [0] keeps only the first fold's score; cross_val_score(...).mean() would give the usual CV estimate (see the sketch after this function)
    cv_score_accuracy_mdl = cross_val_score(mdl, X_trn, y_trn, cv=10, scoring='accuracy', n_jobs=-1)[0]
    cv_score_precision_mdl = cross_val_score(mdl, X_trn, y_trn, cv=10, scoring='precision_macro', n_jobs=-1)[0]
    cv_score_recall_mdl = cross_val_score(mdl, X_trn, y_trn, cv=10, scoring='recall_macro', n_jobs=-1)[0]
    cv_score_f1_score_mdl = cross_val_score(mdl, X_trn, y_trn, cv=10, scoring='f1_macro', n_jobs=-1)[0]
confusion_mtx_mdl = confusion_matrix(y_tst, y_prd)
    df_class_report_mdl = None  # ensure the name exists for the return below even if the classification report fails
    try:
# convert classification report to dictionary to then convert to df
class_report_dict_mdl = classification_report(y_tst, y_prd, output_dict=True) # This might raise an error
# Convert dictionary to df
df_class_report_mdl = pd.DataFrame(class_report_dict_mdl).transpose()
# write to disk!
df_class_report_mdl.to_csv('classification_report/classification_report_' + mdl_name + '.csv', index=True)
    except Exception as error:
        print('Exception caught in classification report on model:', mdl_name, '-', error)
#########################################################
# analysis summary
#########################################################
analysis_end_time = datetime.datetime.now()
analysis_end_time_text = get_time()
analysis_duration = analysis_end_time - analysis_start_time
analysis_duration_text = calculate_time_duration(analysis_duration)
print('\nDone!\n')
print(f'cv_precision: {cv_score_precision_mdl:.4f}', end='\t')
print(f'cv_recall: {cv_score_recall_mdl:.4f}', end='\t')
print(f'cv_f1_score: {cv_score_f1_score_mdl:.4f}', end='\t')
print(f'accuracy: {accuracy_mdl:.4f}')
print("Start Time:\t\t" + analysis_start_time_text)
print("End Time:\t\t" + analysis_end_time_text)
print("Analysis Duration:\t" + analysis_duration_text)
print('============================================================================================================')
return (accuracy_mdl, precision_mdl, recall_mdl, f1_score_mdl, cv_score_accuracy_mdl, cv_score_precision_mdl, cv_score_recall_mdl, cv_score_f1_score_mdl, confusion_mtx_mdl, df_class_report_mdl,
analysis_start_time_text, analysis_end_time_text, analysis_duration_text)
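Note that cross_val_score(...)[0] above keeps only the first fold's score; averaging over all folds is the more common CV summary. A minimal sketch of the alternative (left as a sketch so the reported numbers below stay as-is):
# hedged sketch: summarize cross-validation with the mean over all folds instead of fold 0
scores = cross_val_score(LogisticRegression(random_state=42), X_train_scaled, y_train,
                         cv=10, scoring='accuracy', n_jobs=-1)
print(f'mean cv accuracy: {scores.mean():.4f} +/- {scores.std():.4f} over {len(scores)} folds')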
Train & Evaluate Models¶
#%%time
analysis_start_EVAL_text = "not started"
analysis_end_EVAL_text = "not started"
analysis_start_EVAL = datetime.datetime.now()
analysis_start_EVAL_text = get_time()
dict_trained_model_data = dict()
######################################################
# create dataframe to store eval scores
######################################################
lst_eval_score_cols = ['precision', 'recall', 'f1_score', 'accuracy']
df_evaluation_score = pd.DataFrame(columns=lst_eval_score_cols)
######################################################
# INVOKE MODELS TO LEARN THE DATA
######################################################
for key in dict_models.keys():
mdl_name = key
mdl = dict_models[key]
# train and evaluate each model!
#(accuracy_mdl, precision_mdl, recall_mdl, f1_score_mdl, cv_score_accuracy_mdl, cv_score_precision_mdl, cv_score_recall_mdl, cv_score_f1_score_mdl, confusion_mtx_mdl, df_class_report_mdl, \
# analysis_start_time_text, analysis_end_time_text, analysis_duration_text) = classify_dataset(mdl_name, mdl, X_train_scaled, X_test_scaled, y_train, y_test, lst_unique_class_names)
# store the trained model
dict_trained_model_data[mdl_name] = classify_dataset(mdl_name, mdl, X_train_scaled, X_test_scaled, y_train, y_test, lst_unique_class_names)
cv_score_precision_mdl = dict_trained_model_data[mdl_name][5]
cv_score_recall_mdl = dict_trained_model_data[mdl_name][6]
cv_score_f1_score_mdl = dict_trained_model_data[mdl_name][7]
cv_score_accuracy_mdl = dict_trained_model_data[mdl_name][4]
    # append eval scores to the dataframe (the row index carries the model name)
    instance_evaluation = {
        'precision': cv_score_precision_mdl,
        'recall': cv_score_recall_mdl,
        'f1_score': cv_score_f1_score_mdl,
        'accuracy': cv_score_accuracy_mdl }
    df_evaluation_score.loc[mdl_name] = instance_evaluation
print('============================================================================================================\n\n')
# notify
print('******************************************************************************************************************')
print('******************************************************************************************************************')
print('******************************************************************************************************************')
print("\nLEARNING COMPLETE!!!\n========================================")
analysis_end_EVAL = datetime.datetime.now()
analysis_end_EVAL_text = get_time()
analysis_duration_FINAL = analysis_end_EVAL - analysis_start_EVAL
analysis_duration_EVAL = calculate_time_duration(analysis_duration_FINAL)
print("Trained model count:\t" + str(len(dict_models)))
print("Start Time:\t\t" + analysis_start_EVAL_text)
print("End Time:\t\t" + analysis_end_EVAL_text)
print("Analysis Duration:\t" + analysis_duration_EVAL)
per-model training log (condensed; all runs on 2025-02-07 between 2038 and 2039):

| model | cv_precision | cv_recall | cv_f1_score | accuracy (test) | duration |
|---|---|---|---|---|---|
| logistic_regression | 0.8056 | 0.8056 | 0.8056 | 0.8667 | 0 second(s) |
| perceptron | 0.8611 | 0.8333 | 0.8110 | 0.8000 | 0 second(s) |
| sgd_classifier | 0.8056 | 0.8056 | 0.8056 | 0.9333 | 0 second(s) |
| passive_aggressive | 0.5238 | 0.6667 | 0.5758 | 0.7333 | 0 second(s) |
| ridge_classifier | 0.4667 | 0.5000 | 0.4815 | 0.8667 | 0 second(s) |
| linear_svc | 0.8056 | 0.8056 | 0.8056 | 0.9333 | 0 second(s) |
| svc | 0.8056 | 0.8056 | 0.8056 | 0.8667 | 0 second(s) |
| svc_nu | 0.9333 | 0.8889 | 0.8963 | 0.8000 | 0 second(s) |
| decision_tree | 0.8056 | 0.8056 | 0.8056 | 0.8667 | 0 second(s) |
| random_forest | 0.8056 | 0.8056 | 0.8056 | 1.0000 | 8 second(s) |
| extra_tree | 0.8056 | 0.8056 | 0.8056 | 0.8667 | 6 second(s) |
| bagging_classifier | 0.8056 | 0.8056 | 0.8056 | 0.8000 | 1 second(s) |
| gradient_boosting | 0.8056 | 0.8056 | 0.8056 | 0.8667 | 13 second(s) |
| ada_boosting | 0.8056 | 0.8056 | 0.8056 | 0.8667 | 5 second(s) |
| hist_boosting | 0.8056 | 0.8056 | 0.8056 | 0.8000 | 5 second(s) |
| gaussian_process | 0.7222 | 0.7222 | 0.7143 | 0.8000 | 1 second(s) |
| nbc_gaussian | 0.7222 | 0.7222 | 0.7143 | 0.8667 | 0 second(s) |
| nbc_bernoulli | 0.8667 | 0.8333 | 0.8056 | 0.6667 | 0 second(s) |
| knn | 0.8056 | 0.8056 | 0.8056 | 0.8000 | 0 second(s) |
| quadratic_discriminant | 0.9167 | 0.9167 | 0.9048 | 0.9333 | 0 second(s) |
| linear_discriminant | 0.8056 | 0.8056 | 0.8056 | 0.8667 | 0 second(s) |
| mlp | 0.8056 | 0.8056 | 0.8056 | 0.8667 | 12 second(s) |

LEARNING COMPLETE!!!
========================================
Trained model count: 22
Start Time: 2025-02-07-2038
End Time: 2025-02-07-2039
Analysis Duration: 0 day(s), 0 hour(s), 0 minute(s), 59 second(s)
view performance of each model¶
# sort df by f1_score
df_evaluation_score = df_evaluation_score.sort_values(by='f1_score', ascending=False)
df_evaluation_score
| model | precision | recall | f1_score | accuracy |
|---|---|---|---|---|
| quadratic_discriminant | 0.916667 | 0.916667 | 0.904762 | 0.916667 |
| svc_nu | 0.933333 | 0.888889 | 0.896296 | 0.916667 |
| perceptron | 0.861111 | 0.833333 | 0.810967 | 0.833333 |
| logistic_regression | 0.805556 | 0.805556 | 0.805556 | 0.833333 |
| linear_discriminant | 0.805556 | 0.805556 | 0.805556 | 0.833333 |
| knn | 0.805556 | 0.805556 | 0.805556 | 0.833333 |
| nbc_bernoulli | 0.866667 | 0.833333 | 0.805556 | 0.833333 |
| hist_boosting | 0.805556 | 0.805556 | 0.805556 | 0.833333 |
| ada_boosting | 0.805556 | 0.805556 | 0.805556 | 0.833333 |
| gradient_boosting | 0.805556 | 0.805556 | 0.805556 | 0.833333 |
| bagging_classifier | 0.805556 | 0.805556 | 0.805556 | 0.833333 |
| extra_tree | 0.805556 | 0.805556 | 0.805556 | 0.833333 |
| random_forest | 0.805556 | 0.805556 | 0.805556 | 0.833333 |
| decision_tree | 0.805556 | 0.805556 | 0.805556 | 0.833333 |
| svc | 0.805556 | 0.805556 | 0.805556 | 0.833333 |
| linear_svc | 0.805556 | 0.805556 | 0.805556 | 0.833333 |
| sgd_classifier | 0.805556 | 0.805556 | 0.805556 | 0.833333 |
| mlp | 0.805556 | 0.805556 | 0.805556 | 0.833333 |
| gaussian_process | 0.722222 | 0.722222 | 0.714286 | 0.750000 |
| nbc_gaussian | 0.722222 | 0.722222 | 0.714286 | 0.750000 |
| passive_aggressive | 0.523810 | 0.666667 | 0.575758 | 0.750000 |
| ridge_classifier | 0.466667 | 0.500000 | 0.481481 | 0.583333 |
Select model¶
top_model_eval_instance = df_evaluation_score.iloc[0]
#top_model_eval_instance = df_evaluation_score.iloc[1] #override
top_model_name = top_model_eval_instance.name
print(f'\nTOP MODEL: {top_model_name}\n {top_model_eval_instance} ')
# get the trained model (classify_dataset fit the estimator objects in dict_models in place)
mdl_trained_top_model = dict_models[top_model_name]
confusion_mtx_top_model = dict_trained_model_data[top_model_name][8]
TOP MODEL: quadratic_discriminant
precision    0.916667
recall       0.916667
f1_score     0.904762
accuracy     0.916667
Name: quadratic_discriminant, dtype: float64
confusion_mtx_top_model
array([[4, 0, 0],
[0, 4, 0],
[0, 1, 6]])
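Rows are the true classes and columns the predicted classes (in the usual alphabetical class order, an assumption here), so a single virginica/versicolor confusion remains; overall accuracy falls straight out of the matrix.
# quick check: accuracy from the confusion matrix = trace / total = 14/15
cm = confusion_mtx_top_model
print('test accuracy from confusion matrix:', np.trace(cm) / cm.sum())  # ~0.9333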
Create Confusion Matrix to view Model Classification¶
# create confusion matrix
display_confusion_matrix(top_model_name, confusion_mtx_top_model, lst_unique_class_names)
Determine Feature Importance for Selected Model¶
ftr_importance = None
mdlName = top_model_name
model = dict_models[mdlName]
display_feature_importance_chart(model, mdlName, list(X_train.columns), 10, 'feature_importance_' + str(mdlName))
coefficients:
====================
petal_length	0.01333333333333333
petal_width	0.0
sepal_width	0.0
sepal_length	0.0
Finally, Deploy the selected model¶
i = 0
instance = X_test.iloc[[i]]  # take the test instance (the ground truth below is y_test.iloc[i]); these are unscaled features, so expect a wrong prediction
prediction = mdl_trained_top_model.predict(instance)
ground_truth = y_test.iloc[i]
prediction_correct = (prediction == ground_truth)[0]
print('instance index:', i, ' prediction:', prediction, ' ground truth:', ground_truth, ' prediction correct:', prediction_correct)
instance index: 0 prediction: ['Iris-virginica'] ground truth: Iris-setosa prediction correct: False
# remember to scale the data with the scaler that was fit on the training set
i = 0
instance = X_test_scaled[[i]]  # the scaled counterpart of the same test instance
prediction = mdl_trained_top_model.predict(instance)
ground_truth = y_test.iloc[i]
prediction_correct = (prediction == ground_truth)[0]
print('instance index:', i, ' prediction:', prediction, ' ground truth:', ground_truth, ' prediction correct:', prediction_correct)
instance index: 0 prediction: ['Iris-setosa'] ground truth: Iris-setosa prediction correct: True
defining a new instance¶
instance = np.array([[7.7, 3.1, 5.1, 1.8]])
instance
array([[7.7, 3.1, 5.1, 1.8]])
scaling the instance¶
scaled_instance = scaler.transform(instance)
scaled_instance
array([[1.81813736, 0.30716727, 0.58392945, 0.55594151]])
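The scaled values can be double-checked by hand against the scaler's fitted statistics (assuming scaler was fit on X_train above):
# hedged check: StandardScaler applies (x - mean_) / scale_ learned from the training data
manual = (instance - scaler.mean_) / scaler.scale_
print(np.allclose(manual, scaled_instance))  # expect True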
predicting the instance classification¶
# if the input is not standardized, the prediction is unreliable because the model was trained on scaled features
mdl_trained_top_model.predict(instance)[0]
# if the data is standardized
mdl_trained_top_model.predict(scaled_instance)[0]
'Iris-versicolor'
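For real deployment the fitted scaler and model should be persisted together so future inputs receive the identical transform; a minimal sketch using joblib (an added dependency, not imported above, though it ships with scikit-learn installs):
# hedged sketch: persist the fitted scaler + model, then reload and predict in one step
import joblib

joblib.dump({'scaler': scaler, 'model': mdl_trained_top_model}, 'iris_top_model.joblib')
bundle = joblib.load('iris_top_model.joblib')
new_obs = np.array([[7.7, 3.1, 5.1, 1.8]])
print(bundle['model'].predict(bundle['scaler'].transform(new_obs))[0])  # expect 'Iris-versicolor' as above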